package tk.yourchanges.clicker.clicker; import java.io.File; import java.io.FileInputStream; import java.util.ArrayList; import java.util.HashSet; import java.util.Random; import java.util.Properties; import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import org.apache.commons.lang.math.RandomUtils; import org.apache.http.HttpEntity; import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.conn.params.ConnRoutePNames; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.util.EntityUtils; import org.eclipse.jetty.util.ConcurrentHashSet; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.openqa.selenium.*; import org.openqa.selenium.Proxy.ProxyType; import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.firefox.FirefoxDriver; import org.openqa.selenium.firefox.FirefoxProfile; import org.openqa.selenium.htmlunit.HtmlUnitDriver; import org.openqa.selenium.ie.InternetExplorerDriver; import org.openqa.selenium.remote.CapabilityType; import org.openqa.selenium.remote.DesiredCapabilities; import tk.yourchanges.clicker.proxy.CN88ProxyGetter; import tk.yourchanges.clicker.proxy.CNProxyGetter; import tk.yourchanges.clicker.proxy.Proxy; import tk.yourchanges.clicker.proxy.Runner; /** * * @author <a href="mailto:yourchanges@gmail.com">Yuanjun Li</a> * */ public class Clicker { private WebDriver driver; private ClickerConfig conf; private HashSet<String> urls; private File proxyConfFile; private ConcurrentHashSet<String> proxyServers = new ConcurrentHashSet<String>(); public String getPrxoyServer(){ if(this.proxyServers.size()==0){ autoFetchProxyServers(); } String r =""; for(String p:proxyServers){ r=p; break; } return r; } public void autoFetchProxyServers(){ Properties p = new Properties(); try { p.load(new FileInputStream(proxyConfFile)); } catch (Exception e) { e.printStackTrace(); return; } System.out.println("开始抓取代理服务器,列表如下:"); System.out.println(""); final Set<Proxy> proxies = new CNProxyGetter(p).find(); final Set<Proxy> proxies2 = new CN88ProxyGetter(p).find(); proxies.addAll(proxies2); // String url = "http://yourchanges.tk/blog"; ExecutorService eS = Executors.newFixedThreadPool(Integer.valueOf(p .getProperty("threadPoolSize", "100"))); for (final Proxy proxy : proxies) { eS.execute(new Runner(proxy, p.getProperty("validateURLs", "http://www.baidu.com"),this.proxyServers)); Utils.sleep(10); } } public Clicker(File confFile){ this.init(confFile); } private void init(File confFile){ String jsonString = Utils.readFile(confFile); conf = JsonMapper.buildNonEmptyMapper().fromJson(jsonString , ClickerConfig.class); if(conf.getProxyHostPortString().length==0){ conf.setUseProxy(false); } this.proxyConfFile = new File(confFile.getParentFile().getAbsolutePath()+File.separator+"proxy.ini"); if(conf.isAutoProxy()){ conf.setUseProxy(true); autoFetchProxyServers(); } try { urls = getArticleList(); } catch (Exception e) { e.printStackTrace(); urls.clear(); } if(urls.size()==0){ System.out.println("不能获取任何网页列表"); System.exit(-1); } if(urls.size()==0){ System.out.println("不能获取任何网页列表"); System.exit(-1); } } public Clicker(){ String file = this.getClass().getClassLoader().getResource("conf.json").getFile(); this.init(new File(file)); } /** * 默认启动基于第一个代理地址的WebDriver * * @throws Exception */ public void setUp() throws Exception { setUp(0); } /** * 启动基于第proxyIndex个代理地址的WebDriver * * @param proxyIndex * @throws Exception */ public void setUp(int proxyIndex) throws Exception { if("firefox".equalsIgnoreCase(conf.getBrowser())){ setUpFirefoxDriver(proxyIndex); } else if("ie".equalsIgnoreCase(conf.getBrowser())){ setUpIEDriver(proxyIndex); } else if("chrome".equalsIgnoreCase(conf.getBrowser())){ setUpChromeDriver(proxyIndex); } else if("htmlunit".equalsIgnoreCase(conf.getBrowser())){ setUpHtmlUnitDriver(proxyIndex); } else{ throw new RuntimeException("当前还不支持你指定的浏览器:" + conf.getBrowser()); } if(driver!=null){ driver.manage().timeouts().setScriptTimeout(conf.getJavascriptTimeout(), TimeUnit.SECONDS); //目前该参数不支持CHROME if(!"chrome".equalsIgnoreCase(conf.getBrowser())){ driver.manage().timeouts().pageLoadTimeout(conf.getPageLoadTimeout(), TimeUnit.SECONDS); } driver.manage().timeouts().implicitlyWait(conf.getElementSearchTimeout(), TimeUnit.SECONDS); } } private void setUpChromeDriver(int proxyIndex){ DesiredCapabilities capabilities = DesiredCapabilities.chrome(); if(conf.isUseProxy()){ String proxyString = ""; if(conf.isAutoProxy()){ proxyString = getPrxoyServer(); this.proxyServers.remove(proxyString); } else { proxyString = conf.getProxyHostPortString()[proxyIndex]; } System.out.println("正在使用代理:"+proxyString); org.openqa.selenium.Proxy proxy = new org.openqa.selenium.Proxy(); //proxy.setProxyType(ProxyType.MANUAL); proxy.setHttpProxy("http://" + proxyString); capabilities.setCapability(CapabilityType.PROXY, proxy); } driver = new ChromeDriver(capabilities); } private void setUpFirefoxDriver(int proxyIndex){ FirefoxProfile profile = new FirefoxProfile(); if(conf.isUseProxy()){ String proxyString = ""; if(conf.isAutoProxy()){ proxyString = getPrxoyServer(); this.proxyServers.remove(proxyString); } else { proxyString = conf.getProxyHostPortString()[proxyIndex]; } System.out.println("正在使用代理:"+proxyString); profile.setPreference("network.proxy.type", ProxyType.MANUAL.ordinal()); profile.setPreference("network.proxy.http", proxyString); profile.setPreference("network.proxy.http_port", proxyString); profile.setPreference("network.proxy.ssl", proxyString); profile.setPreference("network.proxy.ssl_port", proxyString); } driver = new FirefoxDriver(profile); } private void setUpIEDriver(int proxyIndex){ DesiredCapabilities capabilities = DesiredCapabilities.internetExplorer(); if(conf.isUseProxy()){ System.out.println("基于IE的代理配置尚不被支持. 请使用其他浏览器。"); conf.setUseProxy(false); /*org.openqa.selenium.Proxy proxy = new org.openqa.selenium.Proxy(); proxy.setProxyType(ProxyType.MANUAL); proxy.setHttpProxy(conf.getProxyHostPortString()[proxyIndex]); //proxy.setSslProxy(conf.getProxyHostPortString()[proxyIndex]); capabilities.setCapability(CapabilityType.PROXY, proxy);*/ } driver = new InternetExplorerDriver(capabilities); } private void setUpHtmlUnitDriver(int proxyIndex){ DesiredCapabilities capabilities = DesiredCapabilities.htmlUnit(); if(conf.isUseProxy()){ String proxyString = ""; if(conf.isAutoProxy()){ proxyString = getPrxoyServer(); this.proxyServers.remove(proxyString); } else { proxyString = conf.getProxyHostPortString()[proxyIndex]; } System.out.println("正在使用代理:"+proxyString); org.openqa.selenium.Proxy proxy = new org.openqa.selenium.Proxy(); //proxy.setProxyType(ProxyType.MANUAL); proxy.setHttpProxy(proxyString); capabilities.setCapability(CapabilityType.PROXY, proxy); capabilities.setBrowserName("firefox"); } driver = new HtmlUnitDriver(capabilities); ((HtmlUnitDriver)driver).setJavascriptEnabled(true); } public String getHtmlWithProxy(String url) { String html = ""; DefaultHttpClient httpclient = new DefaultHttpClient(); // 代理的设置 HttpHost proxy = new HttpHost(conf.getFetchArticleListProxyHost(), conf.getFetchArticleListProxyPort()); httpclient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy); // 目标地址 HttpGet httpget = new HttpGet(url); try { // 执行 HttpResponse response = httpclient.execute(httpget); HttpEntity entity = response.getEntity(); if (entity != null) { html = EntityUtils.toString(entity); // System.out.println(html); } } catch (Exception e) { e.printStackTrace(); } finally { httpclient.getConnectionManager().shutdown(); } return html; } /** * 获取一个页面里的文章列表 * * @param set * @param url * @return * @throws Exception */ private HashSet<String> getArticleList(HashSet<String> set, String url) throws Exception { if(conf.getFetchArticleListProxyHost().trim().isEmpty()){ Connection conn = Jsoup.connect(url); Document doc= conn.get(); Elements elements= doc.select(conf.getArticleLinkXpath()); for(Element e :elements){ String u = e.attr("href"); if(!u.startsWith("http")){ u = url+"/"+u; } System.out.println(u); set.add(u); } } else { String html = getHtmlWithProxy(url); Document doc = Jsoup.parse(html); Elements elements= doc.select(conf.getArticleLinkXpath()); for(Element e :elements){ String u = e.attr("href"); if(!u.startsWith("http")){ u = url+"/"+u; } System.out.println(u); set.add(u); } } return set; } /** * 获取所有URL对应页面里的文章列表 * * @return * @throws Exception */ public HashSet<String> getArticleList(){ System.out.println("获取到下面文章链接:"); HashSet<String> set =new HashSet<String>(); for(String url:conf.getBaseUrl()){ try { set = getArticleList(set, url); } catch (Exception e) { e.printStackTrace(); set.clear(); } } //如果获取不到,再来一次 if(set.size()==0){ for(String url:conf.getBaseUrl()){ try { set = getArticleList(set, url); } catch (Exception e) { e.printStackTrace(); set.clear(); } } } //如果获取不到,再来一次 if(set.size()==0){ for(String url:conf.getBaseUrl()){ try { set = getArticleList(set, url); } catch (Exception e) { e.printStackTrace(); set.clear(); } } } return set; } /** * 先本地不用代理跑一次,然后每个代理跑一次 */ public void runAll(){ boolean flag = this.conf.isUseProxy(); runWithoutProxy(); this.conf.setUseProxy(flag); if(this.conf.isUseProxy()){ long max=conf.getProxyHostPortString().length; if(this.conf.isAutoProxy()){ max=Long.MAX_VALUE; } for(long i=0;i<max;i++){ runFlow((int)i); } } } /** * 本地不用代理跑一次 */ public void runWithoutProxy(){ this.conf.setUseProxy(false); runFlow(0); } private void runFlow(int i){ try{ this.setUp(i); this.run(); }catch(Exception e){ e.printStackTrace(); }finally{ try { this.tearDown(); } catch (Exception e) { e.printStackTrace(); } } } /** * 先进行setUp()<br/><br/> * 跑所有文章,如果处理的文章数大于ClickerConfig.getHandleArticleNumbers()就停止 * * @throws Exception */ public void run() throws Exception { int i=0; ArrayList<String> list = new ArrayList<String>(urls); for(;;){ if((i++)>=conf.getHandleArticleNumPerProxy()){ break; } //随机获取一个URL String url = list.get(RandomUtils.nextInt(list.size()-1)); System.out.println("处理:"+url); //先浏览一次,防止点击率过高 driver.get(url); //点进文章 //广告块 driver.get(url); //点进文章 Utils.sleep(500+new Random().nextInt(500)); //等待页面加载完毕 String source = driver.getPageSource(); if (source != null && source.contains("</body>")){ } else{ System.out.println("页面加载不了"); break; } //广告1..n for(String adXpath:conf.getAdXpath()){ try{ WebElement e = driver.findElement(By.xpath(adXpath)); String hrefString = e.getAttribute("href"); if(checkCanClick(hrefString)){ //System.out.println(hrefString); e.click(); } Utils.sleep(1000+new Random().nextInt(2000)); }catch(Exception e){ e.printStackTrace(); break; } } Utils.sleep(1000+new Random().nextInt(2000)); } } public boolean checkCanClick(String hrefString){ if(hrefString==null){ return false; } for(String d:conf.getNoClickADDomins()){ if(hrefString.contains(d)){ return false; } } return true; } /** * 关闭WebDriver * @throws Exception */ public void tearDown() throws Exception { Utils.sleep(conf.getAdShowTime());//广告载入,展示时间 if(driver != null){ driver.quit(); driver =null; } } /*private boolean isElementPresent(By by) { try { driver.findElement(by); return true; } catch (NoSuchElementException e) { return false; } }*/ }